// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.

// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#ifndef _H_MMM_12X8
#define _H_MMM_12X8

// The three MMM_12X8_* string macros below hold the instruction text of the kernel. They exist
// only so that mmm_12x8 can emit accumulation + write-back as ONE asm statement per element
// type: the accumulators live in v0-v23, and register contents are not guaranteed to survive
// from one asm statement to the next. All macros are #undef'ed right after the function.
// Comments inside the macro bodies must use /* */ (a // comment would swallow the line splice).

// Accumulation phase. Operands used: %[A], %[B], %[K].
//  - x20 / x21 walk A / B, x22 counts the remaining K steps.
//  - One K step consumes 0x60 bytes of A (six q registers) and 0x40 bytes of B (four q
//    registers) and issues 24 BFMMLA instructions (encoded with .inst), one per accumulator.
//  - Loop "0:" handles two K steps per iteration and preloads the operands of the next step;
//    block "1:" handles a single leftover step; "2:" is the exit.
#define MMM_12X8_ACCUMULATE_ASM \
    /* preload the first two A registers and all four B registers of step 0 */ \
    "ld1 {v24.8h, v25.8h}, [%[A]]\n" \
    "ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%[B]]\n" \
    "mov x20, %[A]\n" \
    "mov x21, %[B]\n" \
    "mov x22, %[K]\n" \
    /* clear the 24 fp32 accumulators */ \
    "movi v0.16b, #0x0\n" \
    "movi v1.16b, #0x0\n" \
    "movi v2.16b, #0x0\n" \
    "movi v3.16b, #0x0\n" \
    "movi v4.16b, #0x0\n" \
    "movi v5.16b, #0x0\n" \
    "movi v6.16b, #0x0\n" \
    "movi v7.16b, #0x0\n" \
    "movi v8.16b, #0x0\n" \
    "movi v9.16b, #0x0\n" \
    "movi v10.16b, #0x0\n" \
    "movi v11.16b, #0x0\n" \
    "movi v12.16b, #0x0\n" \
    "movi v13.16b, #0x0\n" \
    "movi v14.16b, #0x0\n" \
    "movi v15.16b, #0x0\n" \
    "movi v16.16b, #0x0\n" \
    "movi v17.16b, #0x0\n" \
    "movi v18.16b, #0x0\n" \
    "movi v19.16b, #0x0\n" \
    "movi v20.16b, #0x0\n" \
    "movi v21.16b, #0x0\n" \
    "movi v22.16b, #0x0\n" \
    "movi v23.16b, #0x0\n" \
    /* fewer than two steps: skip the unrolled loop (flags are reused at label 1) */ \
    "cmp x22, #1\n" \
    "ble 1f\n" \
    "0:\n" \
    /* first K step of the pair */ \
    ".inst 0x6e5cef00  // bfmmla v0.4s, v24.8h, v28.8h\n" \
    ".inst 0x6e5def01  // bfmmla v1.4s, v24.8h, v29.8h\n" \
    ".inst 0x6e5cef24  // bfmmla v4.4s, v25.8h, v28.8h\n" \
    ".inst 0x6e5def25  // bfmmla v5.4s, v25.8h, v29.8h\n" \
    "ldr q26, [x20, 0x20]\n" \
    "ldr q27, [x20, 0x30]\n" \
    ".inst 0x6e5eef02  // bfmmla v2.4s, v24.8h, v30.8h\n" \
    ".inst 0x6e5fef03  // bfmmla v3.4s, v24.8h, v31.8h\n" \
    "ldr q24, [x20, 0x40]\n" \
    ".inst 0x6e5eef26  // bfmmla v6.4s, v25.8h, v30.8h\n" \
    ".inst 0x6e5fef27  // bfmmla v7.4s, v25.8h, v31.8h\n" \
    ".inst 0x6e5cef48  // bfmmla v8.4s, v26.8h, v28.8h\n" \
    "ldr q25, [x20, 0x50]\n" \
    ".inst 0x6e5def49  // bfmmla v9.4s, v26.8h, v29.8h\n" \
    ".inst 0x6e5eef4a  // bfmmla v10.4s, v26.8h, v30.8h\n" \
    ".inst 0x6e5fef4b  // bfmmla v11.4s, v26.8h, v31.8h\n" \
    "ldr q26, [x20, 0x60]\n" \
    ".inst 0x6e5cef6c  // bfmmla v12.4s, v27.8h, v28.8h\n" \
    ".inst 0x6e5cef10  // bfmmla v16.4s, v24.8h, v28.8h\n" \
    ".inst 0x6e5cef34  // bfmmla v20.4s, v25.8h, v28.8h\n" \
    "ldr q28, [x21, 0x40]\n" \
    ".inst 0x6e5def6d  // bfmmla v13.4s, v27.8h, v29.8h\n" \
    ".inst 0x6e5eef6e  // bfmmla v14.4s, v27.8h, v30.8h\n" \
    ".inst 0x6e5fef6f  // bfmmla v15.4s, v27.8h, v31.8h\n" \
    "ldr q27, [x20, 0x70]\n" \
    ".inst 0x6e5def11  // bfmmla v17.4s, v24.8h, v29.8h\n" \
    ".inst 0x6e5def35  // bfmmla v21.4s, v25.8h, v29.8h\n" \
    "ldr q29, [x21, 0x50]\n" \
    ".inst 0x6e5eef12  // bfmmla v18.4s, v24.8h, v30.8h\n" \
    ".inst 0x6e5eef36  // bfmmla v22.4s, v25.8h, v30.8h\n" \
    "ldr q30, [x21, 0x60]\n" \
    ".inst 0x6e5fef13  // bfmmla v19.4s, v24.8h, v31.8h\n" \
    ".inst 0x6e5fef37  // bfmmla v23.4s, v25.8h, v31.8h\n" \
    "ldr q24, [x20, 0x80]\n" \
    "sub x22, x22, #2\n" \
    /* second K step of the pair */ \
    ".inst 0x6e5cef40  // bfmmla v0.4s, v26.8h, v28.8h\n" \
    ".inst 0x6e5cef64  // bfmmla v4.4s, v27.8h, v28.8h\n" \
    "ldr q31, [x21, 0x70]\n" \
    ".inst 0x6e5def41  // bfmmla v1.4s, v26.8h, v29.8h\n" \
    ".inst 0x6e5def65  // bfmmla v5.4s, v27.8h, v29.8h\n" \
    "ldr q25, [x20, 0x90]\n" \
    ".inst 0x6e5eef42  // bfmmla v2.4s, v26.8h, v30.8h\n" \
    ".inst 0x6e5eef66  // bfmmla v6.4s, v27.8h, v30.8h\n" \
    ".inst 0x6e5fef43  // bfmmla v3.4s, v26.8h, v31.8h\n" \
    "ldr q26, [x20, 0xa0]\n" \
    ".inst 0x6e5fef67  // bfmmla v7.4s, v27.8h, v31.8h\n" \
    ".inst 0x6e5cef08  // bfmmla v8.4s, v24.8h, v28.8h\n" \
    "ldr q27, [x20, 0xb0]\n" \
    ".inst 0x6e5def09  // bfmmla v9.4s, v24.8h, v29.8h\n" \
    ".inst 0x6e5eef0a  // bfmmla v10.4s, v24.8h, v30.8h\n" \
    ".inst 0x6e5fef0b  // bfmmla v11.4s, v24.8h, v31.8h\n" \
    "ldr q24, [x20, 0xc0]\n" \
    ".inst 0x6e5cef2c  // bfmmla v12.4s, v25.8h, v28.8h\n" \
    ".inst 0x6e5cef50  // bfmmla v16.4s, v26.8h, v28.8h\n" \
    ".inst 0x6e5cef74  // bfmmla v20.4s, v27.8h, v28.8h\n" \
    "ldr q28, [x21, 0x80]\n" \
    ".inst 0x6e5def2d  // bfmmla v13.4s, v25.8h, v29.8h\n" \
    ".inst 0x6e5eef2e  // bfmmla v14.4s, v25.8h, v30.8h\n" \
    ".inst 0x6e5fef2f  // bfmmla v15.4s, v25.8h, v31.8h\n" \
    "ldr q25, [x20, 0xd0]\n" \
    ".inst 0x6e5def51  // bfmmla v17.4s, v26.8h, v29.8h\n" \
    ".inst 0x6e5def75  // bfmmla v21.4s, v27.8h, v29.8h\n" \
    "ldr q29, [x21, 0x90]\n" \
    ".inst 0x6e5eef52  // bfmmla v18.4s, v26.8h, v30.8h\n" \
    ".inst 0x6e5eef76  // bfmmla v22.4s, v27.8h, v30.8h\n" \
    "ldr q30, [x21, 0xa0]\n" \
    ".inst 0x6e5fef53  // bfmmla v19.4s, v26.8h, v31.8h\n" \
    ".inst 0x6e5fef77  // bfmmla v23.4s, v27.8h, v31.8h\n" \
    "ldr q31, [x21, 0xb0]\n" \
    /* advance past the two steps just consumed */ \
    "add x20, x20, 0xc0\n" \
    "add x21, x21, 0x80\n" \
    "cmp x22, #1\n" \
    "bgt 0b\n" \
    "1:\n" \
    /* flags come from the last "cmp x22, #1": only x22 == 1 runs the single-step tail */ \
    "bne 2f\n" \
    ".inst 0x6e5cef00  // bfmmla v0.4s, v24.8h, v28.8h\n" \
    ".inst 0x6e5def01  // bfmmla v1.4s, v24.8h, v29.8h\n" \
    ".inst 0x6e5cef24  // bfmmla v4.4s, v25.8h, v28.8h\n" \
    ".inst 0x6e5def25  // bfmmla v5.4s, v25.8h, v29.8h\n" \
    "ldr q26, [x20, 0x20]\n" \
    "ldr q27, [x20, 0x30]\n" \
    ".inst 0x6e5eef02  // bfmmla v2.4s, v24.8h, v30.8h\n" \
    ".inst 0x6e5fef03  // bfmmla v3.4s, v24.8h, v31.8h\n" \
    "ldr q24, [x20, 0x40]\n" \
    ".inst 0x6e5eef26  // bfmmla v6.4s, v25.8h, v30.8h\n" \
    ".inst 0x6e5fef27  // bfmmla v7.4s, v25.8h, v31.8h\n" \
    ".inst 0x6e5cef48  // bfmmla v8.4s, v26.8h, v28.8h\n" \
    "ldr q25, [x20, 0x50]\n" \
    ".inst 0x6e5def49  // bfmmla v9.4s, v26.8h, v29.8h\n" \
    ".inst 0x6e5eef4a  // bfmmla v10.4s, v26.8h, v30.8h\n" \
    ".inst 0x6e5fef4b  // bfmmla v11.4s, v26.8h, v31.8h\n" \
    ".inst 0x6e5cef6c  // bfmmla v12.4s, v27.8h, v28.8h\n" \
    ".inst 0x6e5cef10  // bfmmla v16.4s, v24.8h, v28.8h\n" \
    ".inst 0x6e5cef34  // bfmmla v20.4s, v25.8h, v28.8h\n" \
    ".inst 0x6e5def6d  // bfmmla v13.4s, v27.8h, v29.8h\n" \
    ".inst 0x6e5eef6e  // bfmmla v14.4s, v27.8h, v30.8h\n" \
    ".inst 0x6e5fef6f  // bfmmla v15.4s, v27.8h, v31.8h\n" \
    ".inst 0x6e5def11  // bfmmla v17.4s, v24.8h, v29.8h\n" \
    ".inst 0x6e5def35  // bfmmla v21.4s, v25.8h, v29.8h\n" \
    ".inst 0x6e5eef12  // bfmmla v18.4s, v24.8h, v30.8h\n" \
    ".inst 0x6e5eef36  // bfmmla v22.4s, v25.8h, v30.8h\n" \
    ".inst 0x6e5fef13  // bfmmla v19.4s, v24.8h, v31.8h\n" \
    ".inst 0x6e5fef37  // bfmmla v23.4s, v25.8h, v31.8h\n" \
    "2:\n"

// Write-back for 4-byte C elements. Operands used: %[C], %[offset].
// Each accumulator pair (v0/v1, v2/v3, ...) is split with uzp1/uzp2 on 64-bit lanes into two
// 4 x fp32 vectors; two such vectors form one 8-element C row. Every row is loaded from C,
// added to, and stored back; x26 moves by %[offset] bytes between the 12 rows.
#define MMM_12X8_STORE_F32_ASM \
    "mov x26, %[C]\n" \
    "ld1 {v30.4s, v31.4s}, [x26]\n" \
    "uzp1 v24.2d, v0.2d, v1.2d\n" \
    "uzp1 v25.2d, v2.2d, v3.2d\n" \
    "uzp2 v26.2d, v0.2d, v1.2d\n" \
    "uzp2 v27.2d, v2.2d, v3.2d\n" \
    "uzp1 v28.2d, v4.2d, v5.2d\n" \
    "fadd v24.4s, v24.4s, v30.4s\n" \
    "fadd v25.4s, v25.4s, v31.4s\n" \
    "uzp2 v30.2d, v4.2d, v5.2d\n" \
    "uzp1 v29.2d, v6.2d, v7.2d\n" \
    "uzp2 v31.2d, v6.2d, v7.2d\n" \
    "st1 {v24.4s, v25.4s}, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ld1 {v24.4s, v25.4s}, [x26]\n" \
    "uzp1 v0.2d, v8.2d, v9.2d\n" \
    "uzp2 v2.2d, v8.2d, v9.2d\n" \
    "fadd v26.4s, v26.4s, v24.4s\n" \
    "fadd v27.4s, v27.4s, v25.4s\n" \
    "uzp1 v1.2d, v10.2d, v11.2d\n" \
    "uzp2 v3.2d, v10.2d, v11.2d\n" \
    "uzp1 v4.2d, v12.2d, v13.2d\n" \
    "uzp2 v6.2d, v12.2d, v13.2d\n" \
    "st1 {v26.4s, v27.4s}, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ld1 {v24.4s, v25.4s}, [x26]\n" \
    "uzp1 v5.2d, v14.2d, v15.2d\n" \
    "uzp2 v7.2d, v14.2d, v15.2d\n" \
    "uzp1 v8.2d, v16.2d, v17.2d\n" \
    "uzp2 v10.2d, v16.2d, v17.2d\n" \
    "fadd v28.4s, v28.4s, v24.4s\n" \
    "fadd v29.4s, v29.4s, v25.4s\n" \
    "uzp1 v9.2d, v18.2d, v19.2d\n" \
    "uzp2 v11.2d, v18.2d, v19.2d\n" \
    "uzp1 v12.2d, v20.2d, v21.2d\n" \
    "uzp2 v14.2d, v20.2d, v21.2d\n" \
    "st1 {v28.4s, v29.4s}, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ld1 {v24.4s, v25.4s}, [x26]\n" \
    "uzp1 v13.2d, v22.2d, v23.2d\n" \
    "uzp2 v15.2d, v22.2d, v23.2d\n" \
    "fadd v30.4s, v30.4s, v24.4s\n" \
    "fadd v31.4s, v31.4s, v25.4s\n" \
    "st1 {v30.4s, v31.4s}, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    /* from here on the de-interleaved rows already sit in v0..v15, two registers per row */ \
    "ld1 {v24.4s, v25.4s}, [x26]\n" \
    "fadd v0.4s, v0.4s, v24.4s\n" \
    "fadd v1.4s, v1.4s, v25.4s\n" \
    "st1 { v0.4s,  v1.4s}, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ld1 {v24.4s, v25.4s}, [x26]\n" \
    "fadd v2.4s, v2.4s, v24.4s\n" \
    "fadd v3.4s, v3.4s, v25.4s\n" \
    "st1 { v2.4s,  v3.4s}, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ld1 {v24.4s, v25.4s}, [x26]\n" \
    "fadd v4.4s, v4.4s, v24.4s\n" \
    "fadd v5.4s, v5.4s, v25.4s\n" \
    "st1 { v4.4s,  v5.4s}, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ld1 {v24.4s, v25.4s}, [x26]\n" \
    "fadd v6.4s, v6.4s, v24.4s\n" \
    "fadd v7.4s, v7.4s, v25.4s\n" \
    "st1 { v6.4s,  v7.4s}, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ld1 {v24.4s, v25.4s}, [x26]\n" \
    "fadd v8.4s, v8.4s, v24.4s\n" \
    "fadd v9.4s, v9.4s, v25.4s\n" \
    "st1 { v8.4s,  v9.4s}, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ld1 {v24.4s, v25.4s}, [x26]\n" \
    "fadd v10.4s, v10.4s, v24.4s\n" \
    "fadd v11.4s, v11.4s, v25.4s\n" \
    "st1 {v10.4s, v11.4s}, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ld1 {v24.4s, v25.4s}, [x26]\n" \
    "fadd v12.4s, v12.4s, v24.4s\n" \
    "fadd v13.4s, v13.4s, v25.4s\n" \
    "st1 {v12.4s, v13.4s}, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ld1 {v24.4s, v25.4s}, [x26]\n" \
    "fadd v14.4s, v14.4s, v24.4s\n" \
    "fadd v15.4s, v15.4s, v25.4s\n" \
    "st1 {v14.4s, v15.4s}, [x26]\n"

// Write-back for 2-byte C elements. Operands used: %[C], %[offset].
// The fp32 accumulators are narrowed with fcvtn (fp32 -> half precision), packed four
// accumulators at a time into two 8 x half rows via uzp1/uzp2 on 32-bit lanes, added to the
// row loaded from C with a half-precision fadd, and stored back. x26 moves by %[offset]
// bytes between the 12 rows.
#define MMM_12X8_STORE_F16_ASM \
    "mov x26, %[C]\n" \
    "ldr q29, [x26]\n" \
    "fcvtn v24.4h, v0.4s\n" \
    "fcvtn v30.4h, v1.4s\n" \
    "mov v24.d[1], v30.d[0]\n" \
    "fcvtn v25.4h, v2.4s\n" \
    "fcvtn v31.4h, v3.4s\n" \
    "mov v25.d[1], v31.d[0]\n" \
    "uzp1 v0.4s, v24.4s, v25.4s\n" \
    "uzp2 v1.4s, v24.4s, v25.4s\n" \
    "fcvtn v26.4h, v4.4s\n" \
    "fcvtn v30.4h, v5.4s\n" \
    "mov v26.d[1], v30.d[0]\n" \
    "fadd v0.8h, v0.8h, v29.8h\n" \
    "fcvtn v27.4h, v6.4s\n" \
    "fcvtn v31.4h, v7.4s\n" \
    "mov v27.d[1], v31.d[0]\n" \
    "str q0, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ldr q29, [x26]\n" \
    "uzp1 v4.4s, v26.4s, v27.4s\n" \
    "uzp2 v5.4s, v26.4s, v27.4s\n" \
    "fadd v1.8h, v1.8h, v29.8h\n" \
    "str q1, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ldr q29, [x26]\n" \
    "fcvtn v24.4h, v8.4s\n" \
    "fcvtn v30.4h, v9.4s\n" \
    "mov v24.d[1], v30.d[0]\n" \
    "fadd v4.8h, v4.8h, v29.8h\n" \
    "fcvtn v25.4h, v10.4s\n" \
    "fcvtn v31.4h, v11.4s\n" \
    "mov v25.d[1], v31.d[0]\n" \
    "str q4, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ldr q29, [x26]\n" \
    "uzp1 v8.4s, v24.4s, v25.4s\n" \
    "uzp2 v9.4s, v24.4s, v25.4s\n" \
    "fadd v5.8h, v5.8h, v29.8h\n" \
    "str q5, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ldr q29, [x26]\n" \
    "fcvtn v26.4h, v12.4s\n" \
    "fcvtn v30.4h, v13.4s\n" \
    "mov v26.d[1], v30.d[0]\n" \
    "fadd v8.8h, v8.8h, v29.8h\n" \
    "fcvtn v27.4h, v14.4s\n" \
    "fcvtn v31.4h, v15.4s\n" \
    "mov v27.d[1], v31.d[0]\n" \
    "str q8, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ldr q29, [x26]\n" \
    "uzp1 v12.4s, v26.4s, v27.4s\n" \
    "uzp2 v13.4s, v26.4s, v27.4s\n" \
    "fadd v9.8h, v9.8h, v29.8h\n" \
    "str q9, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ldr q29, [x26]\n" \
    "fcvtn v24.4h, v16.4s\n" \
    "fcvtn v30.4h, v17.4s\n" \
    "mov v24.d[1], v30.d[0]\n" \
    "fadd v12.8h, v12.8h, v29.8h\n" \
    "fcvtn v25.4h, v18.4s\n" \
    "fcvtn v31.4h, v19.4s\n" \
    "mov v25.d[1], v31.d[0]\n" \
    "str q12, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ldr q29, [x26]\n" \
    "uzp1 v16.4s, v24.4s, v25.4s\n" \
    "uzp2 v17.4s, v24.4s, v25.4s\n" \
    "fadd v13.8h, v13.8h, v29.8h\n" \
    "str q13, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ldr q29, [x26]\n" \
    "fcvtn v26.4h, v20.4s\n" \
    "fcvtn v30.4h, v21.4s\n" \
    "mov v26.d[1], v30.d[0]\n" \
    "fadd v16.8h, v16.8h, v29.8h\n" \
    "fcvtn v27.4h, v22.4s\n" \
    "fcvtn v31.4h, v23.4s\n" \
    "mov v27.d[1], v31.d[0]\n" \
    "str q16, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ldr q29, [x26]\n" \
    "uzp1 v20.4s, v26.4s, v27.4s\n" \
    "uzp2 v21.4s, v26.4s, v27.4s\n" \
    "fadd v17.8h, v17.8h, v29.8h\n" \
    "str q17, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ldr q29, [x26]\n" \
    "fadd v20.8h, v20.8h, v29.8h\n" \
    "str q20, [x26]\n" \
    "add x26, x26, %[offset]\n" \
    "ldr q29, [x26]\n" \
    "fadd v21.8h, v21.8h, v29.8h\n" \
    "str q21, [x26]\n"

// Registers written by the kernel: the condition flags, memory (C is stored to), every vector
// register, and the four scratch general-purpose registers named explicitly in the asm text.
#define MMM_12X8_CLOBBERS \
    "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", \
    "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", \
    "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x26"

// 12x8 matrix-multiply-accumulate micro-kernel built on BFMMLA: C[12][8] += A * B.
//
// T selects only the element width of C: when sizeof(T) == 4 the destination is read and
// written as fp32, otherwise as 8 half-precision values per row.
//
// offset  Distance in BYTES between consecutive rows of C (it is added directly to the row
//         pointer).
// K4      Number of K steps. Each step reads 0x60 bytes from A and 0x40 bytes from B.
// A, B    Input panels, only read. They are consumed as .8h operands of BFMMLA, so they
//         presumably hold pre-packed bfloat16 data despite the F16 pointer type - TODO
//         confirm against the packing code in the caller.
// C       Destination tile, 12 rows of 8 elements; read, accumulated into, and written back.
//
// NOTE: operands are preloaded one step ahead. When K4 is even (including 0) the kernel
// therefore reads 32 bytes of A and 64 bytes of B beyond the K4 steps; the values are not
// used, but the memory must be readable.
//
// Accumulation and write-back are deliberately emitted as a single asm statement: the
// accumulators are kept in v0-v23 and nothing guarantees that register contents survive
// between two separate asm statements.
template <typename T = F16>
void mmm_12x8(U64 offset, U64 K4, F16 *A, F16 *B, F16 *C)
{
    if (sizeof(T) == 4) {
        __asm__ __volatile__(MMM_12X8_ACCUMULATE_ASM MMM_12X8_STORE_F32_ASM
                             :
                             : [A] "r"(A), [B] "r"(B), [C] "r"(C), [K] "r"(K4),
                             [offset] "r"(offset)
                             : MMM_12X8_CLOBBERS);
    } else {
        __asm__ __volatile__(MMM_12X8_ACCUMULATE_ASM MMM_12X8_STORE_F16_ASM
                             :
                             : [A] "r"(A), [B] "r"(B), [C] "r"(C), [K] "r"(K4),
                             [offset] "r"(offset)
                             : MMM_12X8_CLOBBERS);
    }
}

#undef MMM_12X8_CLOBBERS
#undef MMM_12X8_STORE_F16_ASM
#undef MMM_12X8_STORE_F32_ASM
#undef MMM_12X8_ACCUMULATE_ASM
#endif
